Goals

  1. Perform differential abundance analysis on all cell populations identified from BAL flow by Sasha to identify cell expansion/depletion associated with COVID-19

Setup

Load packages

library(ggplot2)
library(googlesheets4)
library(googledrive)
library(tidyverse)
library(summarytools)
library(car)
library(Hmisc)
library(readxl)
library(ggfortify)
library(flowCore)
library(lubridate)
library(reshape2)
library(pheatmap)
library(RColorBrewer)
library(ggwordcloud)
library(tm)
library(factoextra)
library(uwot)
library(igraph)

mean_sd = function(x){
  return(mean_sdl(x, mult = 1))
}

set.seed(12345)

Google login

drive_auth(use_oob = T, cache = T, email = "rogangrant2022@u.northwestern.edu") # have to run in console :(
gs4_auth(token = drive_token(), use_oob = T, cache = T, email = "rogangrant2022@u.northwestern.edu")

Import Sasha’s data analysis results

data = read_sheet("https://docs.google.com/spreadsheets/d/1KHgJ-ZXQAfgwp-X1U2xjOiYEa--YrR6cGV3U20TxTXo/edit?usp=sharing",
                  skip = 0, 
                  trim_ws = T,
                  .name_repair = "universal",
                  sheet = "2020-6-14_update")
## Reading from "COVID19 BAL stats"
## Range "'2020-6-14_update'"
#remove in-progress entries
data = subset(data, !is.na(Sample))

#mark neutrophilic patients
data$neutrophilic = data$percent_neutrophils > 40

#for subsetting later
observations = table(data$ID)
serial_patients = names(observations[observations > 1])

#adjust types
data$ID = as.character(data$ID)

Import RNA extraction metadata

rna_metadata = read_sheet("https://docs.google.com/spreadsheets/d/15aq9UMRtTl5E75E5M0mXvbFfxVVuJxrXneKYpo6LzlE/edit#gid=897409272",
                  trim_ws = T,
                  .name_repair = "universal", 
                  sheet = "SCRIPT_01 RNAseq Metadata",
                  na = c("", "NA"))
## Reading from "SCRIPT_01_RNAseq"
## Range "'SCRIPT_01 RNAseq Metadata'"
## New names:
## * `` -> ...17
## * `` -> ...19
## * `Kit lot number` -> Kit.lot.number
## * `Elution vol (ul)` -> Elution.vol..ul.
## * `Date of RNA QC` -> Date.of.RNA.QC
## * ...
#clean up colnames
colnames(rna_metadata) = gsub("\\.", "_", colnames(rna_metadata)) #I like underscores
colnames(rna_metadata) = gsub("_+$", "", colnames(rna_metadata)) # remove trailing
colnames(rna_metadata) = gsub("_+", "_", colnames(rna_metadata)) #remove dup underscores
#tube is randomly read in as a list. fix this.
rna_metadata$tubeid_macrophRNA = as.character(rna_metadata$tubeid_macrophRNA)
rna_metadata$tubeid_macrophRNA[rna_metadata$tubeid_macrophRNA == "NULL"] = NA
#take out problem cols (added back)
rna_metadata = rna_metadata %>% 
  dplyr::select(-c(RNAseq_batch, Batch_sample))
#remove preceding zeros
rna_metadata = rna_metadata %>% 
  mutate_at("tubeid_macrophRNA", function(x){ gsub("^0", "", x) })
#fix dates
rna_metadata$sample_process_dt = as.Date(rna_metadata$sample_process_dt)

#merge with flow data
data$BAL_sample = substring(data$Sample, 10, 20)
data = left_join(data, 
                 rna_metadata, 
                 by = c("BAL_sample" = "tc_pt_study_id"))

Import clinical metadata

# simple_md = read_excel(path = "~/Box/COVID19_BAL_flow/01_data/02_clinical_metadata/extracted_clinical_data/LMN_extracted_clinical_data_update052020.xlsx",
#                        sheet = "New list",
#                        .name_repair = "universal")
# simple_md$COVID.status = factor(simple_md$COVID.status)
# simple_md$outcome = factor(ifelse(grepl("deceased", simple_md$Discharged..d.c..or.deceased),
#                                   yes = "deceased",
#                                   no = "discharged"))
# simple_md$Study.ID = factor(as.character(simple_md$Study.ID))
# 
# source("~/Documents/GitHub/COVID19_BAL_flow/rgrant/read_clinical_metadata_covid19.R")
# md = read_clinical_metadata_covid19()
# md = subset(md, grepl("\\d{4}-BAL-\\d{2}", Sample..)) #collected samples follow this format
# 
edw_endpoints = read.csv("~/Box/RGrant/SCRIPT/200608 SCRIPT Basic Endpoints.csv",
                         strip.white = T,
                         check.names = T,
                         na.strings = c("", "NA"))
date_cols = colnames(edw_endpoints)[grepl("date", colnames(edw_endpoints), ignore.case = T) |
                                    grepl("\\_dt", colnames(edw_endpoints))]
edw_endpoints = edw_endpoints %>%
  mutate_at(.vars = date_cols,
            .funs = function(x){
              as.Date(x, format = "%m/%d/%y %H:%M", tz = "CST") })
edw_endpoints$pt_study_id = as.character(edw_endpoints$pt_study_id)

#simplify outcome data
edw_endpoints$binned_outcome = factor(vapply(edw_endpoints$discharge_disposition_name,
                                      function(x)
                                      {
                                        if(x == "Expired")
                                        {
                                          return("Deceased")
                                        } else if(grepl("^Home", x))
                                        {
                                          return("Discharged")
                                        } else if(x %in% c("Acute Care Hospital", "Group Home", "Inpatient Hospice",
                                                           "Planned Readmission – DC/transferred to acute inpatient rehab",
                                                           "Acute Inpatient Rehabilitation", "Long-Term Acute Care Hospital (LTAC)",
                                                           "Skilled Nursing Facility or Subacute Rehab Care"))
                                        {
                                          return("Inpatient Facility")
                                        } else if(is.na(x) | x == "unknown")
                                        {
                                          return(as.character(NA))
                                        } else
                                        {
                                          return("Other")
                                        }}, FUN.VALUE = "char"))
edw_endpoints = edw_endpoints %>% 
  select(-full_name)

# edw_micro = read_excel("~/Box/RGrant/SCRIPT/200526 SCRIPT Microbiology BAL Results.xlsx",
#                        skip = 7,
#                        .name_repair = "universal")
# keep_cols = apply(edw_micro, 2, function(x){ !all(is.na(x)) })
# edw_micro = edw_micro[, keep_cols]
# edw_micro$order.datetime = as.Date(edw_micro$order.datetime, origin = "1899-12-30") #excel date format
# edw_micro = subset(edw_micro, !is.na(order.datetime))
# edw_micro$infection_detected = factor(!is.na(edw_micro$organism.name))
# edw_micro$organism.quantity[grepl(">", edw_micro$organism.quantity)] = "10000"
# edw_micro$organism.quantity = as.numeric(edw_micro$organism.quantity)
# #summarize for easy binding, get rid of garbage info
# edw_micro = edw_micro %>% 
#   group_by(pt.study.id, order.datetime) %>% 
#   dplyr::summarize(detected_organisms = list(organism.name),
#             organism_quantities = list(organism.quantity)) %>%
#   rowwise() %>% 
#   mutate(infection_detected = any(!is.na(detected_organisms)))
# 
# #merge all together
# data$sample_id = substring(data$Sample,
#                            10,
#                            20)
# md$Study.ID = as.character(md$Study.ID)
# #need for joining, fixed after
# edw_micro$order.datetime = as.character(edw_micro$order.datetime)
# md$BAL.Date = as.character(md$BAL.Date)
# data = data %>% 
#   left_join(., md, by = c("sample_id" = "Sample..", "ID" = "Study.ID")) %>% 
#   left_join(., simple_md, by = c("ID" = "Study.ID")) %>% 
#   left_join(., edw_endpoints, by = c("ID" = "pt_study_id")) %>% 
#   left_join(., edw_micro, by = c("ID" = "pt.study.id", "BAL.Date" = "order.datetime"))
# only_metadata = md %>% 
#   left_join(., simple_md, by = "Study.ID") %>% 
#   left_join(., edw_endpoints, by = c("Study.ID" = "pt_study_id")) %>% 
#   left_join(., edw_micro, by = c("Study.ID" = "pt.study.id", "BAL.Date" = "order.datetime"))
# only_metadata = unique(only_metadata)
# data$BAL.Date = as.Date(data$BAL.Date)
# edw_micro$order.datetime = as.Date(edw_micro$order.datetime)

#add EDW molecular data   
edw_molecular = read.csv("~/Box/RGrant/SCRIPT/200608 SCRIPT and COVID BAL Results.csv",
                         na.strings = c("", "NA"),
                         strip.white = T,
                         check.names = T) %>% 
  select(-Name)
#make numeric values numeric   
numeric_cols = c("day_of_intubation", "day_of_hospitalization", "RBC_BODY_FLUID", "WBC__BODY_FLUID", "NEUTROPHILS__BODY_FLUID",
                 "Absolute_Neutrophils", "TOXIC_GRANULATION", "MACROPHAGE_BF", "MONOCYTE_BF", "LYMPH_BF", 
                 "ABSOLUTE_LYMPHOCYTES", "EOSINOPHILS__BODY_FLUID", "ABSOLUTE_EOSINOPHILS", "PLASMA_CELL_BF",
                 "OTHER_CELLS__BODY_FLUID", "AMYLASE_BF", "WHITE_BLOOD_CELL_COUNT", 
                 "C_Reactive_Protein", "LDH", "CREATINE_KINASE__TOTAL", "PROCALCITONIN", "FERRITIN", "TROPONIN_I",
                 "Creatinine", "AST__SGOT_", "D_DIMER", "max_daily_temp")
edw_molecular = edw_molecular %>% mutate_at(.vars = numeric_cols, .funs = function(x){
  x = gsub(">", "", x)
  x = gsub("<", "", x)
  x = gsub(",", "", x)
  x = as.numeric(x)
  return(x)})
#fix test columns
test_cols = colnames(edw_molecular)[c(which(colnames(edw_molecular) == "ASPERGILLUS_GALACTOMANNAN_ANTIGEN_NMH_LFH_"):which(colnames(edw_molecular) == "RESPIRATORY_SYNCYTIAL_VIRUS_RESPAN22"),
                                      which(colnames(edw_molecular) == "STREPTOCOCCUS_PNEUMONIAE_ANTIGEN_URINE_1"),
                                      which(colnames(edw_molecular) == "LEGIONELLA_ANTIGEN__EIA__URINE_1"))]
                                            
edw_molecular = edw_molecular %>% mutate_at(.vars = test_cols, 
                                            .funs = function(x){
                                              x = factor(ifelse(is.na(x),
                                                                 yes = NA,
                                                                 no = ifelse((grepl("Not Detected", x, ignore.case = T) |
                                                                                grepl("Negative", x, ignore.case = T)),
                                                                             yes = "Negative",
                                                                             no = "Positive")))
                                                         return(x) })

#remove one strange duplicate entry
edw_molecular = subset(edw_molecular, !(duplicated(paste(edw_molecular$ir_id, edw_molecular$BAL_order_timestamp))))
#format into long form
edw_molecular = edw_molecular %>% 
  pivot_longer(cols = contains("organism_"),
               names_to = "microbiology_parameter",
               values_to = "microbiology_value") 
edw_molecular$main_microbiology_parameter = factor(gsub("_*\\d", "", edw_molecular$microbiology_parameter))
#flatten these parameters into lists of values
edw_molecular = edw_molecular %>%
  group_by(ir_id, BAL_order_timestamp, main_microbiology_parameter) %>%
  mutate(microbiology_value_list = list(microbiology_value)) %>%  # list column
  ungroup() %>% 
  rowwise() %>% 
  mutate_at(.vars = "microbiology_value_list", .funs = function(x){ #remove NA
   cur = na.omit(x)
   if(length(cur) == 0)
   {
     return(list(NULL))
   } else
   {
    return(list(cur))
   }
  }) %>% 
  select(-c(microbiology_parameter, microbiology_value)) %>% 
  ungroup()
#pivot back into wide form for merging (list values get duplicated, need to fix)
listcols = as.character(unique(edw_molecular$main_microbiology_parameter))
edw_molecular = edw_molecular %>%
  pivot_wider(names_from = main_microbiology_parameter,
              values_from = microbiology_value_list) %>% 
  rowwise() %>% 
  #have to remove duplicated list vals
  mutate_at(.vars = listcols, .funs = function(x){ 
              return(list(x[[1]])) }) %>% 
  ungroup()
edw_molecular$order_accession_num = as.character(edw_molecular$order_accession_num)
#fix dates
date_cols = colnames(edw_molecular)[grepl("date", colnames(edw_molecular), ignore.case = T)]
edw_molecular = edw_molecular %>% 
  mutate_at(.vars = date_cols, .funs = function(x){
    as.Date(x, format = "%m/%d/%y", tz = "CST")} )
#make organism quantity numeric
edw_molecular$organism_quantity = lapply(edw_molecular$organism_quantity,
                                         function(x){
                                           x = gsub(">", "", x)
                                           x = gsub("<", "", x)
                                           x = gsub(",", "", x)
                                           x = as.numeric(x)
                                           if(length(x) == 0)
                                           {
                                             return(NULL)
                                           }
                                           return(x)})
                                           

#import BAL data   
edw_BAL = read.csv("~/Box/RGrant/SCRIPT/200608 SCRIPT BAL Related Labs.csv",
                   na.strings = c("", "NA"),
                         strip.white = T,
                         check.names = T) %>% 
  select(c(ir_id, pt_study_id, redcap_bal_dt)) %>% #these columns aren't helpful
  unique()
edw_BAL$pt_study_id = as.character(edw_BAL$pt_study_id)
edw_BAL$redcap_bal_dt = as.Date(edw_BAL$redcap_bal_dt)
colnames(edw_BAL)[colnames(edw_BAL) == "bal_order_date"] = "BAL_order_date"

#import known COVID patients and all patient IDs
covid_cases = read.csv("~/Box/RGrant/SCRIPT/200608 SCRIPT_COVID_list.csv",
                        na.strings = c("", "NA"),
                         strip.white = T,
                         check.names = T,
                       colClasses = rep("character", 5))
covid_cases$covid_confirmed = T
all_patients = read.csv("~/Box/RGrant/SCRIPT/STU00204868_subjects_06_04_2020.csv",
                        na.strings = c("", "NA"),
                         strip.white = T,
                         check.names = T, 
                        colClasses = rep("character", 10)) %>% 
  separate_rows(case.number, sep = ", ") #uncollapse ID column
patient_data = full_join(covid_cases,
                         all_patients,
                         by = c("clarity_west_mrn" = "nmhc_record_number")) %>% 
  select(-c(first_name, last_name, address.line.1:email, nmff_record_number, nmh_record_number,
            first.name, last.name)) #some are duplicated
colnames(patient_data)[colnames(patient_data) == "case.number"] = "study_id"

#merge metadata
edw_molecular$ir_id = as.character(edw_molecular$ir_id)
edw_endpoints$ir_id = as.character(edw_endpoints$ir_id)
only_metadata = left_join(edw_molecular, 
                          patient_data,
                          by = c("ir_id")) %>%
  select(-MRN) %>% 
  full_join(.,
               edw_endpoints,
               by = c("ir_id", "study_id" = "pt_study_id")) %>% 
  select(-west_mrn)

#fix colnames
colnames(data) = gsub("\\.", "_", colnames(data)) #I like underscores
colnames(data) = gsub("_+$", "", colnames(data)) # remove trailing
colnames(data) = gsub("_+", "_", colnames(data)) #remove dup underscores
colnames(only_metadata) = gsub("\\.", "_", colnames(only_metadata)) 
colnames(only_metadata) = gsub("_+$", "", colnames(only_metadata)) 
colnames(only_metadata) = gsub("_+", "_", colnames(only_metadata))

#derive additional metadata
only_metadata$days_from_death = as.numeric(difftime(only_metadata$BAL_order_date, only_metadata$death_date, units = "days"))
only_metadata$covid_confirmed[is.na(only_metadata$covid_confirmed)] = FALSE #may be a safer way to do this
only_metadata$covid_confirmed = factor(only_metadata$covid_confirmed)
only_metadata$ventilator_days = as.numeric(difftime(only_metadata$BAL_order_date, only_metadata$first_intubation_date, units = "days"))

#merge with flow data
merge_date = rep(NA, nrow(data))
for(i in 1:nrow(data))
{
  #get patient id and bal order date for each entry
  cur = data[i, ]
  patient = cur$ID
  sample_process_dt = cur$sample_process_dt
  latest = as.Date(sample_process_dt)
  earliest = as.Date(sample_process_dt - 1) #24hr window
  
  #perform matching (should just be one match per)
  matches = subset(only_metadata, 
                   study_id == patient & BAL_order_date >= earliest & BAL_order_date <= latest)
  if(nrow(matches) == 0)
  {
    warning(paste("Unmatched sample warning. Patient:", patient, "Sort date:", sample_process_dt))
    next
  } else if(nrow(matches) > 1)
  {
    stop(paste("Error: mutliple matches for single sample. Patient:", patient, "Sort date:", sample_process_dt))
  } else
  {
    merge_date[i] = as.character(matches$BAL_order_date)
  }
}
data = cbind(data, merge_date)
data$merge_date = as.Date(data$merge_date)

data = left_join(data,
                 only_metadata,
                 by = c("ID" = "study_id", "merge_date" = "BAL_order_date"))

#cast into long form
data = unique(data)
mfi_cols = colnames(data)[grepl("MFI", colnames(data), ignore.case = T)]
percentage_cols = colnames(data)[grepl("percent", colnames(data), ignore.case = T)]
data_long = data %>% 
  pivot_longer(cols = c(mfi_cols, percentage_cols), 
                         names_to = "flow_parameter",
                         values_to = "value") %>% 
  arrange(ID, ventilator_days) #for easy viewing later
data_long$flow_parameter = factor(data_long$flow_parameter)

#output for use with bulk
outname = paste0("~/Box/RGrant/SCRIPT/",
                 Sys.Date(),
                 "_",
                 "SCRIPT_clinical_metadata_processed.rds")
saveRDS(object = only_metadata,
          file = outname)
outname = paste0("~/Box/RGrant/SCRIPT/",
                 Sys.Date(),
                 "_",
                 "SCRIPT_flow_plus_clinical_metadata_processed.rds")
saveRDS(object = data,
          file = outname)

data

Summary stats

Metadata

summary_data = only_metadata %>% 
  select(ir_id:OTHER_CELLS_BODY_FLUID,
         AMYLASE_BF,
         ASPERGILLUS_GALACTOMANNAN_ANTIGEN_NMH_LFH:D_DIMER,
         max_daily_temp,
         nmh_mrn:binned_outcome)
dfSummary(summary_data, plain.ascii = FALSE, style = "multiline", 
          graph.magnif = 0.75, valid.col = FALSE, tmp.img.dir = "/tmp", justify = "c")

Distributions

Percentages

percentage_data = subset(data_long, grepl("percent", data_long$flow_parameter))
shapiro.test(percentage_data$value)
## 
##  Shapiro-Wilk normality test
## 
## data:  percentage_data$value
## W = 0.74261, p-value < 2.2e-16
hist(percentage_data$value, breaks = 50)

MFIs

mfi_data = subset(data_long, grepl("MFI", data_long$flow_parameter))
shapiro.test(mfi_data$value)
## 
##  Shapiro-Wilk normality test
## 
## data:  mfi_data$value
## W = 0.62702, p-value < 2.2e-16
hist(mfi_data$value, breaks = 50)

Extremely non-normal in both cases. At the very least, we need nonparametric tests. In reality, we probably need non-parametric tests or beta regression. However, this may not be true of individual parameters. First let’s see if transformations help.

Progressions

serial_data = subset(percentage_data, ID %in% serial_patients & 
                       !(flow_parameter %in% c("percent_CD4_total", "percent_CD206_total", "percent_total_CD206_low", "percent_total_CD206_high")))
ggplot(serial_data, aes(x = ventilator_days, y = value, fill = flow_parameter)) +
  geom_bar(position = "stack", stat = "identity") +
  facet_grid(ID ~ .) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 448 rows containing missing values (position_stack).

# ggplot(serial_data, aes(x = days_from_death, y = value, fill = flow_parameter)) +
#   geom_bar(position = "stack", stat = "identity") +
#   facet_grid(ID ~ .) +
#   theme(axis.text.x = element_text(angle = 45, hjust = 1))
# ggplot(serial_data, aes(x = days_from_extubation, y = value, fill = flow_parameter)) +
#   geom_bar(position = "stack", stat = "identity") +
#   facet_grid(ID ~ .) +
#   theme(axis.text.x = element_text(angle = 45, hjust = 1))

As expected, there appear to be neutrophilic and non-neutrophilic trajectories captured here.

Plot contributions (just day 0 for now)

day0_percentages = subset(percentage_data, ventilator_days == 0 &
                            !(flow_parameter %in% c("percent_CD4_total", "percent_CD206_total", "percent_total_CD206_low", "percent_total_CD206_high")))
ggplot(day0_percentages, aes(x = ID, y = value, fill = flow_parameter)) +
  geom_bar(position = "stack", stat = "identity") +
  facet_grid(. ~ covid_confirmed, scales = "free_x") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Transformations

Percentages

Arcsine

hist(asin(sqrt(percentage_data$value)), breaks = 50)
## Warning in asin(sqrt(percentage_data$value)): NaNs produced

shapiro.test(asin(sqrt(percentage_data$value)))
## Warning in asin(sqrt(percentage_data$value)): NaNs produced
## 
##  Shapiro-Wilk normality test
## 
## data:  asin(sqrt(percentage_data$value))
## W = 0.97364, p-value = 1.15e-05

Getting there!

Logit

hist(car::logit(percentage_data$value), breaks = 50)
## Warning in car::logit(percentage_data$value): proportions remapped to (0.025,
## 0.975)

shapiro.test(car::logit(percentage_data$value))
## Warning in car::logit(percentage_data$value): proportions remapped to (0.025,
## 0.975)
## 
##  Shapiro-Wilk normality test
## 
## data:  car::logit(percentage_data$value)
## W = 0.89537, p-value < 2.2e-16

This is getting very close. Let’s see if individual observations are normal.

Individual parameters

percentage_data$logit = car::logit(percentage_data$value)
## Warning in car::logit(percentage_data$value): proportions remapped to (0.025,
## 0.975)
percentage_data %>%
  group_by(flow_parameter) %>% 
  dplyr::summarize(pval = shapiro.test(logit)$p.value)   
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(percentage_data, aes(x = logit)) +
  geom_histogram() +
  facet_wrap(~ flow_parameter)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

We can probably work with this, actually. Ideally we should talk to a statistician at some point before publication. Use normalized values for all stats.

MFIs

Log transform

hist(log10(mfi_data$value), breaks = 50)
## Warning in hist(log10(mfi_data$value), breaks = 50): NaNs produced

shapiro.test(log10(mfi_data$value))
## Warning in stopifnot(is.numeric(x)): NaNs produced
## 
##  Shapiro-Wilk normality test
## 
## data:  log10(mfi_data$value)
## W = 0.97248, p-value = 1.942e-13

Trimodal? But within individual parameters this should be okay. .

Individual parameters

mfi_data$log10 = log10(mfi_data$value)
## Warning: NaNs produced
mfi_data %>%
  group_by(flow_parameter) %>% 
  dplyr::summarize(pval = shapiro.test(log10)$p.value)   
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(mfi_data, aes(x = log10)) +
  geom_histogram() +
  facet_wrap(~ flow_parameter)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).

Looks pretty good.

Analysis

MANCOVA (percentages)

data_wide_percentage = percentage_data %>% 
  pivot_wider(names_from = flow_parameter, 
              values_from = c(logit, value))
data_wide_percentage$covid_confirmed[is.na(data_wide_percentage$covid_confirmed)] = FALSE
data_wide_percentage$covid_confirmed = factor(data_wide_percentage$covid_confirmed)
data_wide_percentage$response = data_wide_percentage %>% 
  select(logit_percent_CD4_total:logit_percent_others)
mancova_data = data_wide_percentage %>% 
  select(c(logit_percent_CD4_total:logit_percent_others, 
           covid_confirmed, ventilator_days)) %>% 
  na.omit()
mancova_fit = manova(cbind(logit_percent_CD4_total, logit_percent_CD4_Tregs, logit_percent_CD4_non_Tregs,
                           logit_percent_CD8_total, logit_percent_CD206_total,  logit_percent_macs_CD206_high,
                           logit_percent_total_CD206_high,  logit_percent_macs_CD206_low, logit_percent_total_CD206_low,
                           logit_percent_neutrophils, logit_percent_monocytes, logit_percent_others) 
                     ~ covid_confirmed * Outcomes,
                     na.action = "na.omit",
                     data = mancova_data) #add outcomes??
summary.aov(mancova_fit)

These are interesting results! With the exception of monocytes, abundance of all cells is affected by COVID-19 in some way. For CD4T, and CD8T, this appears to be purely a main effect. For Tregs and putative MoAM, there is also an interaction effect of day and infection. For TRAM, there is only an interaction effect.

Visualize

Area plot

area_data = percentage_data %>% 
  group_by(covid_confirmed, ventilator_days, flow_parameter) %>% 
  dplyr::summarize(mean_pct = mean(value))
## `summarise()` regrouping output by 'covid_confirmed', 'ventilator_days' (override with `.groups` argument)
area_data = subset(area_data, !(flow_parameter %in% c("percent_CD4_total", "percent_CD206_total", "percent_total_CD206_low", "percent_total_CD206_high")))
ggplot(area_data, aes(x = ventilator_days, y = mean_pct, fill = flow_parameter)) +
  geom_area(alpha = 0.7) +
  facet_grid(covid_confirmed ~ .)
## Warning: Removed 16 rows containing missing values (position_stack).

This give a false impression between samples. A line plot may actually be better.

Line plot

Grouped

ggplot(percentage_data, aes(x = ventilator_days, y = value, color = flow_parameter)) +
  stat_summary(fun = mean, geom = "line") +
  stat_summary(fun.data = mean_se, geom = "errorbar", width = 0.5, alpha = 0.5) +
  facet_grid(binned_outcome ~ covid_confirmed)

By patient

ggplot(percentage_data, aes(x = ventilator_days, y = value, color = flow_parameter)) +
  geom_line() +
  facet_wrap(~ ID)

Not enough data at this point to glean much of anything.

Heatmaps of flow parameters

D0

#cast into matrix of patient x flow parameter
all_day0 = data %>% 
  dplyr::filter(ventilator_days == 0 & (is.na(ventilator_days) | ventilator_days == 0)) %>% 
  select(ID, percent_CD4_total:percent_others, binned_outcome, covid_confirmed)
annotation_data = all_day0 %>% 
  select(ID, binned_outcome, covid_confirmed) %>% 
  remove_rownames() %>% 
  column_to_rownames(var = "ID")

matrix_data = all_day0 %>% 
  select(ID, percent_CD4_total:percent_others) %>% 
  remove_rownames() %>% 
  column_to_rownames(var = "ID") %>% 
  as.matrix()

pheatmap(mat = matrix_data, 
         cluster_rows = T,
         cluster_cols = T,
         clustering_method = "ward.D2",
         annotation_row = annotation_data,
         scale = "column",
         angle_col = 45,
         breaks = seq(-3, 3, length.out=101),
         color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(101))

All days

all_days = data %>% 
  dplyr::filter(!is.na(ventilator_days)) %>% 
  select(Sample, percent_CD4_total:percent_others, binned_outcome, covid_confirmed, ID, ventilator_days) %>% 
  arrange(ID, ventilator_days)
annotation_data = all_days %>% 
  select(Sample, binned_outcome, covid_confirmed, ID, ventilator_days) %>% 
  remove_rownames() %>% 
  column_to_rownames(var = "Sample")

matrix_data = all_days %>% 
  select(Sample, percent_CD4_total:percent_others) %>% 
  remove_rownames() %>% 
  column_to_rownames(var = "Sample") %>% 
  as.matrix()

pheatmap(mat = matrix_data, 
         cluster_rows = T,
         cluster_cols = T,
         clustering_method = "ward.D2",
         annotation_row = annotation_data,
         scale = "column",
         angle_col = 45,
         show_rownames = F,
         breaks = seq(-3, 3, length.out=101),
         color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(101))

pheatmap(mat = matrix_data, 
         cluster_rows = F,
         cluster_cols = T,
         clustering_method = "ward.D2",
         annotation_row = annotation_data,
         scale = "column",
         angle_col = 45,
         show_rownames = F,
         breaks = seq(-3, 3, length.out=101),
         color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(101))

Only serial

all_days = data %>% 
  dplyr::filter(!is.na(ventilator_days) & ID %in% serial_patients) %>% 
  rowwise() %>% 
  mutate(infection_detected = factor(!is.null(organism_quantity))) %>% 
  ungroup() %>% 
  select(Sample, percent_CD4_total:percent_others, binned_outcome, 
         covid_confirmed, ID, ventilator_days,
         C_Reactive_Protein, infection_detected) %>% 
  arrange(ID, ventilator_days)
annotation_data = all_days %>% 
  select(Sample, binned_outcome, covid_confirmed, 
         C_Reactive_Protein, infection_detected, ventilator_days) %>% 
  remove_rownames() %>% 
  column_to_rownames(var = "Sample")

matrix_data = all_days %>% 
  select(Sample, percent_CD4_total:percent_others) %>% 
  remove_rownames() %>% 
  column_to_rownames(var = "Sample") %>% 
  as.matrix()

pheatmap(mat = matrix_data, 
         cluster_rows = T,
         cluster_cols = T,
         clustering_method = "ward.D2",
         annotation_row = annotation_data,
         scale = "column",
         angle_col = 45,
         show_rownames = T,
         breaks = seq(-3, 3, length.out=101),
         color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(101))

breaks = which(!duplicated(all_days$ID)) - 1
pheatmap(mat = matrix_data, 
         cluster_rows = F,
         cluster_cols = T,
         clustering_method = "ward.D2",
         annotation_row = annotation_data,
         scale = "column",
         angle_col = 45,
         show_rownames = T,
         gaps_row = breaks,
         breaks = seq(-3, 3, length.out=101),
         color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(101))

Co-infection

Proportion of microbe-infected individuals

Plot

only_metadata = only_metadata %>%
  rowwise() %>% 
  mutate(infection_detected = !is.null(organism_name)) %>% 
  ungroup()
only_metadata$infection_detected[is.na(only_metadata$infection_detected)] = "Not Tested"
only_metadata$infection_detected = factor(only_metadata$infection_detected,
                                      levels = c("Not Tested", "TRUE", "FALSE"))
ggplot(subset(only_metadata, !is.na(covid_confirmed)), aes(x = covid_confirmed, fill = infection_detected)) +
  geom_bar(position = "fill") +
  ylab("Proportion") +
  scale_fill_manual(values = c("Not Tested" = "gray", "FALSE" = "cornflowerblue", "TRUE" = "firebrick"))

Levels of infection

Histograms of smear quantities

covid_organism_levels = only_metadata %>% 
  dplyr::filter(covid_confirmed == T) %>% 
  select(organism_quantity) %>% 
  unlist()
other_organism_levels = only_metadata %>% 
  dplyr::filter(covid_confirmed == F) %>% 
  select(organism_quantity) %>% 
  unlist()
ggplot(NULL) +
  geom_density(aes(x = covid_organism_levels), fill = "firebrick", alpha = 0.5) +
  geom_density(aes(x = other_organism_levels), fill = "cornflowerblue", alpha = 0.5)

Infection type

Word cloud

covid_organisms = as.character(only_metadata %>% 
  dplyr::filter(covid_confirmed == T) %>% 
  select(organism_name) %>% 
  unlist() %>% 
  na.omit())
covid_organisms = gsub("[[:punct:]] | | [[:punct:]]|[[:punct:]]", "_", trimws(covid_organisms)) #so we can treat as one word
covid_organisms = gsub("_$", "", covid_organisms)

other_organisms = as.character(only_metadata %>%
                                 dplyr::filter(covid_confirmed == F) %>% 
                                 select(organism_name) %>% 
                                 unlist() %>% 
                                 na.omit())
other_organisms = gsub("[[:punct:]] | | [[:punct:]]|[[:punct:]]", "_", trimws(other_organisms))
other_organisms = gsub("_$", "", other_organisms)

#make into data frame with normalized freqs
df1 = data.frame(word = names(termFreq(covid_organisms)),
                 freq = as.numeric(termFreq(covid_organisms)) / 
                                     sum(as.numeric(termFreq(covid_organisms))),
                 covid_confirmed = T)
df2 = data.frame(word = names(termFreq(other_organisms)),
                 freq = as.numeric(termFreq(other_organisms)) /
                                     sum(as.numeric(termFreq(other_organisms))),
                 covid_confirmed = F)

#finally, plot
wordcould_df = rbind(df1, df2)
ggplot(wordcould_df, aes(label = word, size = freq)) +
  geom_text_wordcloud(show.legend = F) +
  facet_wrap(~ covid_confirmed)
## Warning in wordcloud_boxes(data_points = points_valid_first, boxes = boxes, :
## Some words could not fit on page. They have been placed at their original
## positions.

## Warning in wordcloud_boxes(data_points = points_valid_first, boxes = boxes, :
## Some words could not fit on page. They have been placed at their original
## positions.

Specific organisms

Viridans Streptococcus

metagenomics_data_a = only_metadata %>% 
  unique() %>% 
  unnest_longer(col = organism_name, simplify = T) %>% 
  select(-c(Organism_ID, organism_quantity))
metagenomics_data_b = only_metadata %>% 
  unique() %>% 
  unnest_longer(col = organism_quantity, simplify = T) %>% 
  select(-c(Organism_ID))
metagenomics_data_b$organism_quantity[is.na(metagenomics_data_b$organism_quantity)] = 0
metagenomics_data = cbind(metagenomics_data_a, organism_quantity = metagenomics_data_b$organism_quantity) %>% 
  pivot_wider(names_from = organism_name, 
              values_from = organism_quantity)
rm(metagenomics_data_a, metagenomics_data_b)

viridans_data = metagenomics_data %>% 
  select(ir_id, BAL_order_timestamp, covid_confirmed, 
         `Viridans streptococcus`, `Viridans streptococcus #2`, 
         `Viridans streptococcus #3`) %>% 
  pivot_longer(cols = c(`Viridans streptococcus`, `Viridans streptococcus #2`, `Viridans streptococcus #3`),
               names_to = "strain", values_to = "organism_quantity")
viridans_data$organism_quantity[is.na(viridans_data$organism_quantity)] = 0
ggplot(viridans_data, aes(x = covid_confirmed, y = organism_quantity, fill = strain)) +
  geom_boxplot()

Factor analysis

Subset to usable factors

clinical_data = only_metadata %>% 
  unique() %>% #oddly there is duplication
  dplyr::filter(!is.na(BAL_order_date)) %>% #remove samples without BAL info
  mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>% 
  select(c(sample_id,
           RBC_BODY_FLUID:Absolute_Neutrophils,
           MACROPHAGE_BF:OTHER_CELLS_BODY_FLUID,
           AMYLASE_BF,
           WHITE_BLOOD_CELL_COUNT:D_DIMER, 
           max_daily_temp)) %>% 
    column_to_rownames(var = "sample_id")

PCA

Calculation

safe_cols = function(x, proportion_present = 0.5)
{
  return(all(is.finite(na.omit(x))) & 
           sd(x, na.rm = T) > 0 &
           (sum(!is.na(x)) / (length(x)) > proportion_present))
}
keep_cols = apply(clinical_data, 2, safe_cols)
pca_data = na.omit(clinical_data[, keep_cols])

#need to use formula interface to deal with NAs because nothing is ever easy
formula = paste0("~ ",
                 paste(colnames(pca_data), collapse = "+"))
formula = as.formula(formula)
pca = prcomp(formula = formula,
             data = pca_data, 
             scale. = T,
             na.action = na.omit)

pca_data_complete = only_metadata %>% 
  unique() %>%
  mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>% 
  dplyr::filter(sample_id %in% rownames(pca_data)) %>% 
  column_to_rownames(var = "sample_id")

Screeplot

fviz_eig(pca)

Biplot

autoplot(pca, 
         data = pca_data_complete, 
         colour = "binned_outcome",
         shape = "covid_confirmed",
         loadings = TRUE, 
         loadings.colour = alpha("blue", 0.1),
         loadings.label = TRUE, 
         loadings.label.size = 2,
         loadings.label.colour = alpha("red", 0.7),
         x = 1,
         y = 2)
## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

autoplot(pca, 
         data = pca_data_complete, 
         colour = "binned_outcome",
         shape = "covid_confirmed",
         loadings = TRUE, 
         loadings.colour = alpha("blue", 0.1),
         loadings.label = TRUE, 
         loadings.label.size = 2,
         loadings.label.colour = alpha("red", 0.7),
         x = 2,
         y = 3)

autoplot(pca, 
         data = pca_data_complete, 
         colour = "binned_outcome",
         shape = "covid_confirmed",
         loadings = TRUE, 
         loadings.colour = alpha("blue", 0.1),
         loadings.label = TRUE, 
         loadings.label.size = 2,
         loadings.label.colour = alpha("red", 0.7),
         x = 3,
         y = 4)

Some interesting loadings: PC1-2 is driven heavily by markers of sepsis and/or organ failure. PC3 appears to just broadly be immune response, but I find it interesting that CRP and monocytes (in this case from BAL) travel together. Are they the source of IL6? Is this a response to IL6? PC4 is similar, with influence of CRP and ferritin suggesting high levels of inflammation. Let’s also try UMAP to see if there is some hidden structure.

Calculation on fewer parameters

clinical_data_ltd = only_metadata %>% 
  unique() %>% #oddly there is duplication
  dplyr::filter(!is.na(BAL_order_date)) %>% #remove samples without BAL info
  mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>% 
  select(c(sample_id,
           Absolute_Neutrophils,
           WHITE_BLOOD_CELL_COUNT:D_DIMER, 
           max_daily_temp)) %>% 
    column_to_rownames(var = "sample_id")

keep_cols = apply(clinical_data_ltd, 2, safe_cols)
pca_data = na.omit(clinical_data_ltd[, keep_cols])

#need to use formula interface to deal with NAs because nothing is ever easy
formula = paste0("~ ",
                 paste(colnames(pca_data), collapse = "+"))
formula = as.formula(formula)
pca = prcomp(formula = formula,
             data = pca_data, 
             scale. = T,
             na.action = na.omit)

pca_data_complete = only_metadata %>% 
  unique() %>%
  mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>% 
  dplyr::filter(sample_id %in% rownames(pca_data)) %>% 
  column_to_rownames(var = "sample_id")

Biplot

autoplot(pca, 
         data = pca_data_complete, 
         colour = "binned_outcome",
         loadings = TRUE, 
         loadings.colour = alpha("blue", 0.1),
         loadings.label = TRUE, 
         loadings.label.size = 2,
         loadings.label.colour = alpha("red", 0.7),
         x = 1,
         y = 2)

autoplot(pca, 
         data = pca_data_complete, 
         colour = "binned_outcome",
         loadings = TRUE, 
         loadings.colour = alpha("blue", 0.1),
         loadings.label = TRUE, 
         loadings.label.size = 2,
         loadings.label.colour = alpha("red", 0.7),
         x = 2,
         y = 3)

autoplot(pca, 
         data = pca_data_complete, 
         colour = "binned_outcome",
         loadings = TRUE, 
         loadings.colour = alpha("blue", 0.1),
         loadings.label = TRUE, 
         loadings.label.size = 2,
         loadings.label.colour = alpha("red", 0.7),
         x = 3,
         y = 4)

UMAP

Calculate UMAP

umap = umap(pca_data, 
            n_neighbors = 15,
            min_dist = 0.1,
            scale = "Z")
colnames(umap) = c("umap_1", "umap_2")
umap_data = cbind(pca_data_complete, umap)

Plot

ggplot(umap_data, aes(x = umap_1, y = umap_2, color = binned_outcome)) +
  geom_point() +
  stat_ellipse()
## Too few points to calculate an ellipse
## Warning: Removed 1 row(s) containing missing values (geom_path).

This doesn’t seem to add much.

Just D0 samples

clinical_day0 = only_metadata %>% 
  unique() %>% #oddly there is duplication
  dplyr::filter(!is.na(BAL_order_date) & ventilator_days == 0) %>% #remove samples without BAL info
  mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>% 
  select(c(sample_id,
           Absolute_Neutrophils,
           WHITE_BLOOD_CELL_COUNT:D_DIMER, 
           max_daily_temp)) %>% 
    column_to_rownames(var = "sample_id") %>% 
  na.omit()
keep_cols = apply(clinical_day0, 2, safe_cols)
pca_data = clinical_day0[, keep_cols]

pca_data_complete = only_metadata %>% 
  unique() %>%
  mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>% 
  dplyr::filter(sample_id %in% rownames(pca_data)) %>% 
  column_to_rownames(var = "sample_id")

#need to use formula interface to deal with NAs because nothing is ever easy
formula = paste0("~ ",
                 paste(colnames(pca_data), collapse = "+"))
formula = as.formula(formula)
pca = prcomp(formula = formula,
             data = pca_data, 
             scale. = T,
             na.action = na.omit)

autoplot(pca, 
         data = pca_data_complete, 
         colour = "binned_outcome",
         loadings = TRUE, 
         loadings.colour = alpha("blue", 0.1),
         loadings.label = TRUE, 
         loadings.label.size = 2,
         loadings.label.colour = alpha("red", 0.7),
         x = 1,
         y = 2)

autoplot(pca, 
         data = pca_data_complete, 
         colour = "binned_outcome",
         loadings = TRUE, 
         loadings.colour = alpha("blue", 0.1),
         loadings.label = TRUE, 
         loadings.label.size = 2,
         loadings.label.colour = alpha("red", 0.7),
         x = 2,
         y = 3)

autoplot(pca, 
         data = pca_data_complete, 
         colour = "binned_outcome",
         loadings = TRUE, 
         loadings.colour = alpha("blue", 0.1),
         loadings.label = TRUE, 
         loadings.label.size = 2,
         loadings.label.colour = alpha("red", 0.7),
         x = 3,
         y = 4)

Not actually very different from complete dataset.

With flow data

All samples

flow_pca = data %>% 
  unique() %>% #oddly there is duplication
  dplyr::filter(!is.na(merge_date)) %>% #remove samples without BAL info
  mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>% 
  select(c(Sample,
           Absolute_Neutrophils,
           WHITE_BLOOD_CELL_COUNT:D_DIMER, 
           max_daily_temp,
           percent_CD4_total:percent_others)) %>% 
    column_to_rownames(var = "Sample")

keep_cols = apply(flow_pca, 2, function(x){ safe_cols(x = x, proportion_present = 0.9) })
pca_data = na.omit(flow_pca[, keep_cols])

#need to use formula interface to deal with NAs because nothing is ever easy
formula = paste0("~ ",
                 paste(colnames(pca_data), collapse = "+"))
formula = as.formula(formula)
pca = prcomp(formula = formula,
             data = pca_data, 
             scale. = T,
             na.action = na.omit)

pca_data_complete = data %>% 
  unique() %>%
  dplyr::filter(Sample %in% rownames(pca_data)) %>% 
  column_to_rownames(var = "Sample")

autoplot(pca, 
         data = pca_data_complete, 
         colour = "binned_outcome",
         loadings = TRUE, 
         loadings.colour = alpha("blue", 0.1),
         loadings.label = TRUE, 
         loadings.label.size = 2,
         loadings.label.colour = alpha("red", 0.7),
         x = 1,
         y = 2)

autoplot(pca, 
         data = pca_data_complete, 
         colour = "binned_outcome",
         loadings = TRUE, 
         loadings.colour = alpha("blue", 0.1),
         loadings.label = TRUE, 
         loadings.label.size = 2,
         loadings.label.colour = alpha("red", 0.7),
         x = 2,
         y = 3)

autoplot(pca, 
         data = pca_data_complete, 
         colour = "binned_outcome",
         loadings = TRUE, 
         loadings.colour = alpha("blue", 0.1),
         loadings.label = TRUE, 
         loadings.label.size = 2,
         loadings.label.colour = alpha("red", 0.7),
         x = 3,
         y = 4)

autoplot(pca, 
         data = pca_data_complete, 
         colour = "binned_outcome",
         loadings = TRUE, 
         loadings.colour = alpha("blue", 0.1),
         loadings.label = TRUE, 
         loadings.label.size = 2,
         loadings.label.colour = alpha("red", 0.7),
         x = 4,
         y = 5)

autoplot(pca, 
         data = pca_data_complete, 
         colour = "binned_outcome",
         loadings = TRUE, 
         loadings.colour = alpha("blue", 0.1),
         loadings.label = TRUE, 
         loadings.label.size = 2,
         loadings.label.colour = alpha("red", 0.7),
         x = 5,
         y = 6)

Hierarchical clustering

All days

heatmap_data = only_metadata %>% 
  unique() %>% #oddly there is duplication
  dplyr::filter(!is.na(BAL_order_date)) %>% #remove samples without BAL info
  mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>% 
  select(c(sample_id,
           Absolute_Neutrophils,
           WHITE_BLOOD_CELL_COUNT:D_DIMER, 
           max_daily_temp)) %>% 
    column_to_rownames(var = "sample_id")

heatmap_metadata = only_metadata %>% 
  unique() %>%
  mutate(sample_id = paste(ir_id, BAL_order_timestamp, sep = "_")) %>% 
  dplyr::filter(sample_id %in% rownames(heatmap_data)) %>% 
  select(sample_id, ventilator_days, binned_outcome) %>% 
  column_to_rownames(var = "sample_id")

pheatmap(heatmap_data, 
         cluster_rows = T,
         cluster_cols = T,
         clustering_method = "ward.D2",
         annotation_row = heatmap_metadata,
         scale = "column",
         angle_col = 45,
         breaks = seq(-5, 5, length.out=101),
         color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(101))